Project: NYC 311 Service Request Analysis¶

Environment Setup¶

In [ ]:
# import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mn #visualizing missing data
import plotly.express as px
import plotly.io as pio
# This ensures Plotly output works in multiple places:
# plotly_mimetype: VS Code notebook UI
# notebook: "Jupyter: Export to HTML" command in VS Code
# See https://plotly.com/python/renderers/#multiple-renderers
pio.renderers.default = "notebook"
pio.templates.default = "simple_white"
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
import plotly.graph_objects as go
from plotly.subplots import make_subplots
In [ ]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[ ]:
In [ ]:
nyc_df = pd.read_csv("311_Service_Requests_from_2010_to_Present.csv")
nyc_df.head()
Out[ ]:
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Zip Incident Address ... Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Garage Lot Name Ferry Direction Ferry Terminal Name Latitude Longitude Location
0 32310363 12/31/2015 11:59:45 PM 01/01/2016 12:55:15 AM NYPD New York City Police Department Noise - Street/Sidewalk Loud Music/Party Street/Sidewalk 10034.0 71 VERMILYEA AVENUE ... NaN NaN NaN NaN NaN NaN NaN 40.865682 -73.923501 (40.86568153633767, -73.92350095571744)
1 32309934 12/31/2015 11:59:44 PM 01/01/2016 01:26:57 AM NYPD New York City Police Department Blocked Driveway No Access Street/Sidewalk 11105.0 27-07 23 AVENUE ... NaN NaN NaN NaN NaN NaN NaN 40.775945 -73.915094 (40.775945312321085, -73.91509393898605)
2 32309159 12/31/2015 11:59:29 PM 01/01/2016 04:51:03 AM NYPD New York City Police Department Blocked Driveway No Access Street/Sidewalk 10458.0 2897 VALENTINE AVENUE ... NaN NaN NaN NaN NaN NaN NaN 40.870325 -73.888525 (40.870324522111424, -73.88852464418646)
3 32305098 12/31/2015 11:57:46 PM 01/01/2016 07:43:13 AM NYPD New York City Police Department Illegal Parking Commercial Overnight Parking Street/Sidewalk 10461.0 2940 BAISLEY AVENUE ... NaN NaN NaN NaN NaN NaN NaN 40.835994 -73.828379 (40.83599404683083, -73.82837939584206)
4 32306529 12/31/2015 11:56:58 PM 01/01/2016 03:24:42 AM NYPD New York City Police Department Illegal Parking Blocked Sidewalk Street/Sidewalk 11373.0 87-14 57 ROAD ... NaN NaN NaN NaN NaN NaN NaN 40.733060 -73.874170 (40.733059618956815, -73.87416975810375)

5 rows × 53 columns

In [ ]:
#color palette for this notebook
colors = ["#5f8e00","#84c600","#d6ecaa","#1c335e","#2ec7ab","#b9ece3",\
"#e8ac65","#c7006a","#ff1f62","#ffb4cb","#ff8214"]
palette = sns.color_palette(palette = colors)

sns.palplot(palette, size =1)
plt.show()

Basic data exploratory analysis¶

In [ ]:
''' 
It could be seen that, for numerical atrributes: 
`School or Citywide Complaint`, `Verhicle Type` , `Taxi Company Borough`, `Taxi Pick Up Location` and `Garage Lot Name` are all missing values
with their count of row equals to 0 and all other statistical value are NaN.
Hence, we should dropna for these 5 columns.
'''
nyc_df.describe()
Out[ ]:
Unique Key Incident Zip X Coordinate (State Plane) Y Coordinate (State Plane) School or Citywide Complaint Vehicle Type Taxi Company Borough Taxi Pick Up Location Garage Lot Name Latitude Longitude
count 3.645580e+05 361560.000000 3.605280e+05 360528.000000 0.0 0.0 0.0 0.0 0.0 360528.000000 360528.000000
mean 3.106595e+07 10858.496659 1.005043e+06 203425.305782 NaN NaN NaN NaN NaN 40.724980 -73.924946
std 7.331531e+05 578.263114 2.196362e+04 29842.192857 NaN NaN NaN NaN NaN 0.081907 0.079213
min 2.960737e+07 83.000000 9.133570e+05 121185.000000 NaN NaN NaN NaN NaN 40.499040 -74.254937
25% 3.049938e+07 10314.000000 9.919460e+05 182945.000000 NaN NaN NaN NaN NaN 40.668742 -73.972253
50% 3.108795e+07 11209.000000 1.003470e+06 201023.000000 NaN NaN NaN NaN NaN 40.718406 -73.930643
75% 3.167433e+07 11238.000000 1.019134e+06 222790.000000 NaN NaN NaN NaN NaN 40.778166 -73.874098
max 3.231065e+07 11697.000000 1.067186e+06 271876.000000 NaN NaN NaN NaN NaN 40.912869 -73.700715
In [ ]:
nyc_df.describe(include= "object")
Out[ ]:
Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Incident Address Street Name Cross Street 1 ... School State School Zip School Not Found Bridge Highway Name Bridge Highway Direction Road Ramp Bridge Highway Segment Ferry Direction Ferry Terminal Name Location
count 364558 362177 364558 364558 364558 358057 364425 312859 312859 307370 ... 364558 364557 364558 297 297 262 262 1 2 360528
unique 362018 339837 1 3 24 45 18 126372 7693 6234 ... 2 1 1 29 34 2 187 1 2 146751
top 02/11/2015 10:56:36 PM 09/10/2015 07:12:49 AM NYPD New York City Police Department Blocked Driveway No Access Street/Sidewalk 1207 BEACH AVENUE BROADWAY BEND ... Unspecified Unspecified N FDR Dr East/Queens Bound Roadway East 96th St (Exit 14) - Triborough Br (Exit 17) Manhattan Bound St. George Terminal (Staten Island) (40.83036235589997, -73.86602154214397)
freq 3 3 364558 364548 100881 75888 301372 1014 4122 5123 ... 364557 364557 364558 36 26 199 6 1 1 1012

4 rows × 42 columns

In [ ]:
''' 
We have 53 attributes and 364.588 observations.
'''
nyc_df.shape
Out[ ]:
(364558, 53)
In [ ]:
''' 
For object data type, there are 25 columns has null values and 17 completes columns.
'''
nyc_df.select_dtypes('object').isna().any().value_counts()
Out[ ]:
True     25
False    17
Name: count, dtype: int64
In [ ]:
''' 
For numeric data type, there are 10 columns has null values and only 1 completes columns.
'''
nyc_df.select_dtypes('number').isna().any().value_counts()
Out[ ]:
True     10
False     1
Name: count, dtype: int64

Cleaning Missing Values¶

In [ ]:
''' 
We could see many columns are filled with Null Values.
Columns that are totally missing values: Landmark, Bridge Highway Name, Bridge Highway Direction, Road Ramp, Bridge Highway Segment, Ferry Direction Ferry Terminal Name.
--> Total 7 columns 
'''
mn.matrix(nyc_df.select_dtypes(include='object'))
Out[ ]:
<Axes: >
In [ ]:
mn.matrix(nyc_df.select_dtypes(include='number'))
Out[ ]:
<Axes: >
In [ ]:
pio.templates
Out[ ]:
Templates configuration
-----------------------
    Default template: 'simple_white'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']
In [ ]:
'''  
There are 9 columns has the null percentage over 80% 
and 4 columns has the null percentage over 10%
--> we will drop these columns.
'''
columns = []
perc_null = []
for column in nyc_df.select_dtypes(include='object'):
    columns.append(column)
    perc_null.append(round(nyc_df[column].isna().sum()*100/nyc_df.shape[0], 2))

obj_null_calc = pd.DataFrame(list(zip(columns,perc_null)), columns=['Column','Null Percentage'])
#obj_null_calc.head(100)

fig = px.bar(data_frame=obj_null_calc, x='Column', y ='Null Percentage', color='Column', color_discrete_sequence = colors,
             title= "[Object] Null percentage per column", text='Null Percentage')
fig.update_layout(showlegend = False)
fig.update_traces(textposition='inside')
fig.show()
In [ ]:
numeric_columns = []
numeric_perc_null = []

for col in nyc_df.select_dtypes(include= 'number'):
    numeric_columns.append(col)
    numeric_perc_null.append(round(nyc_df[col].isna().sum()*100/nyc_df.shape[0],2))
num_null_calc = pd.DataFrame(list(zip(numeric_columns,numeric_perc_null)), columns=['Columns','Null Percentage'])
fig = px.bar(data_frame=num_null_calc, x='Columns', y='Null Percentage', color='Columns', color_discrete_sequence=colors,
       title='[Numeric] Null percentage per column', text = 'Null Percentage')
fig.update_traces(textposition='inside')
fig.update_layout(showlegend=False)
fig.show()
In [ ]:
'''  
Drop columns with greater then >10% null percentage
=> obj + numberic columns = 18 columns will be dropped
'''

limit_null = nyc_df.shape[0]*0.9
nyc_df_clean = nyc_df.dropna(axis=1, thresh= limit_null)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364558 entries, 0 to 364557
Data columns (total 35 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Unique Key                      364558 non-null  int64  
 1   Created Date                    364558 non-null  object 
 2   Closed Date                     362177 non-null  object 
 3   Agency                          364558 non-null  object 
 4   Agency Name                     364558 non-null  object 
 5   Complaint Type                  364558 non-null  object 
 6   Descriptor                      358057 non-null  object 
 7   Location Type                   364425 non-null  object 
 8   Incident Zip                    361560 non-null  float64
 9   Address Type                    361306 non-null  object 
 10  City                            361561 non-null  object 
 11  Facility Type                   362169 non-null  object 
 12  Status                          364558 non-null  object 
 13  Due Date                        364555 non-null  object 
 14  Resolution Description          364558 non-null  object 
 15  Resolution Action Updated Date  362156 non-null  object 
 16  Community Board                 364558 non-null  object 
 17  Borough                         364558 non-null  object 
 18  X Coordinate (State Plane)      360528 non-null  float64
 19  Y Coordinate (State Plane)      360528 non-null  float64
 20  Park Facility Name              364558 non-null  object 
 21  Park Borough                    364558 non-null  object 
 22  School Name                     364558 non-null  object 
 23  School Number                   364558 non-null  object 
 24  School Region                   364557 non-null  object 
 25  School Code                     364557 non-null  object 
 26  School Phone Number             364558 non-null  object 
 27  School Address                  364558 non-null  object 
 28  School City                     364558 non-null  object 
 29  School State                    364558 non-null  object 
 30  School Zip                      364557 non-null  object 
 31  School Not Found                364558 non-null  object 
 32  Latitude                        360528 non-null  float64
 33  Longitude                       360528 non-null  float64
 34  Location                        360528 non-null  object 
dtypes: float64(5), int64(1), object(29)
memory usage: 97.3+ MB
In [ ]:
'''  
Since we have 4 columns described the position of incindent on map, which are lat/long and x/y coordinate.
I would refer to use lat/long then, i will drop these 2 x/y coordinate columns as well as location column (which contains the paired of lat/long).
And I don't need the Incident Zip as well
'''
nyc_df_clean.drop(columns=['X Coordinate (State Plane)','Y Coordinate (State Plane)', 'Location', 'Incident Zip'], inplace=True)
nyc_df_clean.shape
Out[ ]:
(364558, 31)
In [ ]:
mn.dendrogram(nyc_df_clean)
Out[ ]:
<Axes: >
In [ ]:
null_column = pd.Series(nyc_df_clean.isna().sum(), name='Value').to_frame()
null_column
Out[ ]:
Value
Unique Key 0
Created Date 0
Closed Date 2381
Agency 0
Agency Name 0
Complaint Type 0
Descriptor 6501
Location Type 133
Address Type 3252
City 2997
Facility Type 2389
Status 0
Due Date 3
Resolution Description 0
Resolution Action Updated Date 2402
Community Board 0
Borough 0
Park Facility Name 0
Park Borough 0
School Name 0
School Number 0
School Region 1
School Code 1
School Phone Number 0
School Address 0
School City 0
School State 0
School Zip 1
School Not Found 0
Latitude 4030
Longitude 4030
In [ ]:
'''  
For `Descriptor`, `Location Type`, `Address Type`, `City`, `Facility Type` column, i will fill the null with 'Unspecified'
For `Longitude` and `Latitude` , we will drop null rows
For `Closed Date` and `Due Date` and `Resolution Action Updated Date` we will replace the null by plus the created date with the maximum duration to resolved based on each complaint type.
For `School Region`, `School Code`, `School Zip` we will fill it with the mode value.
'''

fill_unspecified_list = ['Descriptor', 'Location Type', 'Address Type', 'City', 'Facility Type']
for col in fill_unspecified_list:
    nyc_df_clean[col] = nyc_df_clean[col].fillna('Unspecified')
In [ ]:
'''  
We could see that all the columns start with 'School' have the top is unspecified. 
And the 'school not found' has only 1 value is 'N' which is not informative.
Hence, we will drop all these columns.
'''
nyc_df_clean.loc[:,nyc_df_clean.columns.str.startswith('School')].describe()
Out[ ]:
School Name School Number School Region School Code School Phone Number School Address School City School State School Zip School Not Found
count 364558 364558 364557 364557 364558 364558 364558 364558 364557 364558
unique 2 2 1 1 2 2 2 2 1 1
top Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified Unspecified N
freq 364557 364557 364557 364557 364557 364557 364557 364557 364557 364558
In [ ]:
nyc_df_clean.drop(columns=nyc_df_clean.loc[:,nyc_df_clean.columns.str.startswith('School')].columns.values, inplace=True)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364558 entries, 0 to 364557
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Unique Key                      364558 non-null  int64  
 1   Created Date                    364558 non-null  object 
 2   Closed Date                     362177 non-null  object 
 3   Agency                          364558 non-null  object 
 4   Agency Name                     364558 non-null  object 
 5   Complaint Type                  364558 non-null  object 
 6   Descriptor                      364558 non-null  object 
 7   Location Type                   364558 non-null  object 
 8   Address Type                    364558 non-null  object 
 9   City                            364558 non-null  object 
 10  Facility Type                   364558 non-null  object 
 11  Status                          364558 non-null  object 
 12  Due Date                        364555 non-null  object 
 13  Resolution Description          364558 non-null  object 
 14  Resolution Action Updated Date  362156 non-null  object 
 15  Community Board                 364558 non-null  object 
 16  Borough                         364558 non-null  object 
 17  Park Facility Name              364558 non-null  object 
 18  Park Borough                    364558 non-null  object 
 19  Latitude                        360528 non-null  float64
 20  Longitude                       360528 non-null  float64
dtypes: float64(2), int64(1), object(18)
memory usage: 58.4+ MB
In [ ]:
nyc_df_clean['Longitude'] = nyc_df_clean['Longitude'].fillna(nyc_df_clean['Longitude'].mode()[0])
nyc_df_clean['Latitude'] = nyc_df_clean['Latitude'].fillna(nyc_df_clean['Latitude'].mode()[0])
In [ ]:
nyc_df_clean.loc[:,nyc_df_clean.columns.str.endswith('Date')]
Out[ ]:
Created Date Closed Date Due Date Resolution Action Updated Date
0 12/31/2015 11:59:45 PM 01/01/2016 12:55:15 AM 01/01/2016 07:59:45 AM 01/01/2016 12:55:15 AM
1 12/31/2015 11:59:44 PM 01/01/2016 01:26:57 AM 01/01/2016 07:59:44 AM 01/01/2016 01:26:57 AM
2 12/31/2015 11:59:29 PM 01/01/2016 04:51:03 AM 01/01/2016 07:59:29 AM 01/01/2016 04:51:03 AM
3 12/31/2015 11:57:46 PM 01/01/2016 07:43:13 AM 01/01/2016 07:57:46 AM 01/01/2016 07:43:13 AM
4 12/31/2015 11:56:58 PM 01/01/2016 03:24:42 AM 01/01/2016 07:56:58 AM 01/01/2016 03:24:42 AM
... ... ... ... ...
364553 01/01/2015 12:04:44 AM 01/01/2015 10:22:31 AM 01/01/2015 08:04:44 AM 01/01/2015 10:22:31 AM
364554 01/01/2015 12:04:28 AM 01/01/2015 02:25:02 AM 01/01/2015 08:04:28 AM 01/01/2015 02:25:02 AM
364555 01/01/2015 12:01:30 AM 01/01/2015 12:20:33 AM 01/01/2015 08:01:30 AM 01/01/2015 12:20:33 AM
364556 01/01/2015 12:01:29 AM 01/01/2015 02:42:22 AM 01/01/2015 08:01:29 AM 01/01/2015 02:42:22 AM
364557 01/01/2015 12:00:50 AM 01/01/2015 02:47:50 AM 01/01/2015 08:00:50 AM 01/01/2015 02:47:50 AM

364558 rows × 4 columns

In [ ]:
date_type_column = nyc_df_clean.loc[:, nyc_df_clean.columns.str.endswith('Date')].columns.values
for col in date_type_column:
    nyc_df_clean[col] = pd.to_datetime(nyc_df_clean[col], format='%m/%d/%Y %I:%M:%S %p')
    #nyc_df_clean[col] = nyc_df_clean[col].dt.strftime('%m/%d/%Y %H:%M:%S')

nyc_df_clean.loc[:,nyc_df_clean.columns.str.endswith('Date')]
Out[ ]:
Created Date Closed Date Due Date Resolution Action Updated Date
0 2015-12-31 23:59:45 2016-01-01 00:55:15 2016-01-01 07:59:45 2016-01-01 00:55:15
1 2015-12-31 23:59:44 2016-01-01 01:26:57 2016-01-01 07:59:44 2016-01-01 01:26:57
2 2015-12-31 23:59:29 2016-01-01 04:51:03 2016-01-01 07:59:29 2016-01-01 04:51:03
3 2015-12-31 23:57:46 2016-01-01 07:43:13 2016-01-01 07:57:46 2016-01-01 07:43:13
4 2015-12-31 23:56:58 2016-01-01 03:24:42 2016-01-01 07:56:58 2016-01-01 03:24:42
... ... ... ... ...
364553 2015-01-01 00:04:44 2015-01-01 10:22:31 2015-01-01 08:04:44 2015-01-01 10:22:31
364554 2015-01-01 00:04:28 2015-01-01 02:25:02 2015-01-01 08:04:28 2015-01-01 02:25:02
364555 2015-01-01 00:01:30 2015-01-01 00:20:33 2015-01-01 08:01:30 2015-01-01 00:20:33
364556 2015-01-01 00:01:29 2015-01-01 02:42:22 2015-01-01 08:01:29 2015-01-01 02:42:22
364557 2015-01-01 00:00:50 2015-01-01 02:47:50 2015-01-01 08:00:50 2015-01-01 02:47:50

364558 rows × 4 columns

In [ ]:
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364558 entries, 0 to 364557
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   Unique Key                      364558 non-null  int64         
 1   Created Date                    364558 non-null  datetime64[ns]
 2   Closed Date                     362177 non-null  datetime64[ns]
 3   Agency                          364558 non-null  object        
 4   Agency Name                     364558 non-null  object        
 5   Complaint Type                  364558 non-null  object        
 6   Descriptor                      364558 non-null  object        
 7   Location Type                   364558 non-null  object        
 8   Address Type                    364558 non-null  object        
 9   City                            364558 non-null  object        
 10  Facility Type                   364558 non-null  object        
 11  Status                          364558 non-null  object        
 12  Due Date                        364555 non-null  datetime64[ns]
 13  Resolution Description          364558 non-null  object        
 14  Resolution Action Updated Date  362156 non-null  datetime64[ns]
 15  Community Board                 364558 non-null  object        
 16  Borough                         364558 non-null  object        
 17  Park Facility Name              364558 non-null  object        
 18  Park Borough                    364558 non-null  object        
 19  Latitude                        364558 non-null  float64       
 20  Longitude                       364558 non-null  float64       
dtypes: datetime64[ns](4), float64(2), int64(1), object(14)
memory usage: 58.4+ MB
In [ ]:
'''  
We could seen that the complaint type Ferry Complaint has no Due Date and Closed Date and 1 missing Value on Resolution Action Updated Date
So it is not informative for us, we will drop these rows.
'''
display(nyc_df_clean[nyc_df_clean['Complaint Type']=='Ferry Complaint'][['Complaint Type','Created Date', 'Due Date','Closed Date']])
nyc_df_clean[nyc_df_clean['Complaint Type']=='Ferry Complaint'].info()
Complaint Type Created Date Due Date Closed Date
161610 Ferry Complaint 2015-08-03 08:28:29 NaT NaT
192974 Ferry Complaint 2015-07-06 09:12:34 NaT NaT
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, 161610 to 192974
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Unique Key                      2 non-null      int64         
 1   Created Date                    2 non-null      datetime64[ns]
 2   Closed Date                     0 non-null      datetime64[ns]
 3   Agency                          2 non-null      object        
 4   Agency Name                     2 non-null      object        
 5   Complaint Type                  2 non-null      object        
 6   Descriptor                      2 non-null      object        
 7   Location Type                   2 non-null      object        
 8   Address Type                    2 non-null      object        
 9   City                            2 non-null      object        
 10  Facility Type                   2 non-null      object        
 11  Status                          2 non-null      object        
 12  Due Date                        0 non-null      datetime64[ns]
 13  Resolution Description          2 non-null      object        
 14  Resolution Action Updated Date  1 non-null      datetime64[ns]
 15  Community Board                 2 non-null      object        
 16  Borough                         2 non-null      object        
 17  Park Facility Name              2 non-null      object        
 18  Park Borough                    2 non-null      object        
 19  Latitude                        2 non-null      float64       
 20  Longitude                       2 non-null      float64       
dtypes: datetime64[ns](4), float64(2), int64(1), object(14)
memory usage: 352.0+ bytes
In [ ]:
nyc_df_clean = nyc_df_clean.drop(nyc_df_clean[(nyc_df_clean['Complaint Type'] == 'Ferry Complaint') & (nyc_df_clean['Due Date'].isnull())].index)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
Index: 364556 entries, 0 to 364557
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   Unique Key                      364556 non-null  int64         
 1   Created Date                    364556 non-null  datetime64[ns]
 2   Closed Date                     362177 non-null  datetime64[ns]
 3   Agency                          364556 non-null  object        
 4   Agency Name                     364556 non-null  object        
 5   Complaint Type                  364556 non-null  object        
 6   Descriptor                      364556 non-null  object        
 7   Location Type                   364556 non-null  object        
 8   Address Type                    364556 non-null  object        
 9   City                            364556 non-null  object        
 10  Facility Type                   364556 non-null  object        
 11  Status                          364556 non-null  object        
 12  Due Date                        364555 non-null  datetime64[ns]
 13  Resolution Description          364556 non-null  object        
 14  Resolution Action Updated Date  362155 non-null  datetime64[ns]
 15  Community Board                 364556 non-null  object        
 16  Borough                         364556 non-null  object        
 17  Park Facility Name              364556 non-null  object        
 18  Park Borough                    364556 non-null  object        
 19  Latitude                        364556 non-null  float64       
 20  Longitude                       364556 non-null  float64       
dtypes: datetime64[ns](4), float64(2), int64(1), object(14)
memory usage: 61.2+ MB
In [ ]:
'''  
For the remain row of dude is NaT and resolution action updated date is NaT. We could see that the Closed Date is same day with the Created Date. 
So i will fill NaT for Due Date and Resolution Action with the same Closed Date.
'''
nyc_df_clean[nyc_df_clean['Due Date'].isnull()]
Out[ ]:
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Address Type City ... Status Due Date Resolution Description Resolution Action Updated Date Community Board Borough Park Facility Name Park Borough Latitude Longitude
175921 31129107 2015-07-21 16:05:39 2015-07-21 21:56:38 NYPD NYPD Blocked Driveway No Access Unspecified ADDRESS BROOKLYN ... Draft NaT The Police Department responded to the complai... NaT 10 BROOKLYN BROOKLYN Unspecified BROOKLYN 40.625772 -73.999564

1 rows × 21 columns

In [ ]:
mask = (nyc_df_clean['Due Date'].isnull()) & (nyc_df_clean['Resolution Action Updated Date'].isnull())
nyc_df_clean.loc[mask, ['Due Date', 'Resolution Action Updated Date']] = nyc_df_clean.loc[mask, 'Closed Date']
In [ ]:
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
Index: 364556 entries, 0 to 364557
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   Unique Key                      364556 non-null  int64         
 1   Created Date                    364556 non-null  datetime64[ns]
 2   Closed Date                     362177 non-null  datetime64[ns]
 3   Agency                          364556 non-null  object        
 4   Agency Name                     364556 non-null  object        
 5   Complaint Type                  364556 non-null  object        
 6   Descriptor                      364556 non-null  object        
 7   Location Type                   364556 non-null  object        
 8   Address Type                    364556 non-null  object        
 9   City                            364556 non-null  object        
 10  Facility Type                   364556 non-null  object        
 11  Status                          364556 non-null  object        
 12  Due Date                        364556 non-null  datetime64[ns]
 13  Resolution Description          364556 non-null  object        
 14  Resolution Action Updated Date  362156 non-null  datetime64[ns]
 15  Community Board                 364556 non-null  object        
 16  Borough                         364556 non-null  object        
 17  Park Facility Name              364556 non-null  object        
 18  Park Borough                    364556 non-null  object        
 19  Latitude                        364556 non-null  float64       
 20  Longitude                       364556 non-null  float64       
dtypes: datetime64[ns](4), float64(2), int64(1), object(14)
memory usage: 61.2+ MB
In [ ]:
nyc_df_clean[(nyc_df_clean['Closed Date'].isnull()) & (nyc_df_clean['Resolution Action Updated Date'].isnull())]
Out[ ]:
Unique Key Created Date Closed Date Agency Agency Name Complaint Type Descriptor Location Type Address Type City ... Status Due Date Resolution Description Resolution Action Updated Date Community Board Borough Park Facility Name Park Borough Latitude Longitude
416 32305700 2015-12-31 14:16:04 NaT NYPD New York City Police Department Illegal Parking Posted Parking Sign Violation Street/Sidewalk Unspecified Unspecified ... Open 2015-12-31 22:16:04 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022
611 32309308 2015-12-31 09:58:06 NaT NYPD New York City Police Department Noise - Street/Sidewalk Loud Music/Party Street/Sidewalk Unspecified Unspecified ... Open 2015-12-31 17:58:06 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022
1648 32303348 2015-12-30 05:13:42 NaT NYPD New York City Police Department Illegal Parking Commercial Overnight Parking Street/Sidewalk Unspecified Unspecified ... Open 2015-12-30 13:13:42 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022
1816 32294519 2015-12-29 22:44:50 NaT NYPD New York City Police Department Derelict Vehicle With License Plate Street/Sidewalk Unspecified Unspecified ... Open 2015-12-30 06:44:50 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022
1965 32296487 2015-12-29 19:09:13 NaT NYPD New York City Police Department Derelict Vehicle With License Plate Street/Sidewalk Unspecified Unspecified ... Open 2015-12-30 03:09:13 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
363643 29618691 2015-01-02 11:38:52 NaT NYPD New York City Police Department Derelict Vehicle With License Plate Street/Sidewalk Unspecified Unspecified ... Open 2015-01-02 19:38:52 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022
363789 29619230 2015-01-02 07:25:16 NaT NYPD New York City Police Department Illegal Parking Posted Parking Sign Violation Street/Sidewalk Unspecified Unspecified ... Open 2015-01-02 15:25:16 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022
363805 29618672 2015-01-02 06:14:06 NaT NYPD New York City Police Department Derelict Vehicle With License Plate Street/Sidewalk Unspecified Unspecified ... Open 2015-01-02 14:14:06 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022
364151 29617561 2015-01-01 16:04:35 NaT NYPD New York City Police Department Illegal Parking Blocked Hydrant Street/Sidewalk Unspecified Unspecified ... Assigned 2015-01-02 00:04:35 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022
364349 29617257 2015-01-01 07:30:10 NaT NYPD New York City Police Department Illegal Parking Posted Parking Sign Violation Street/Sidewalk Unspecified Unspecified ... Open 2015-01-01 15:30:10 Your complaint has been forwarded to the New Y... NaT 0 Unspecified Unspecified Unspecified Unspecified 40.830362 -73.866022

2362 rows × 21 columns

In [ ]:
''' 
We could seen the closed date is nan because the status hasn't closed yet.
So there is not problem with this column.
But the draft status seems to be useless so i will drop that row.
'''
nyc_df_clean.groupby('Status')['Closed Date'].apply(lambda x: x.isnull().sum())
Out[ ]:
Status
Assigned     776
Closed         0
Draft          1
Open        1602
Name: Closed Date, dtype: int64
In [ ]:
nyc_df_clean = nyc_df_clean[nyc_df_clean['Status'] != 'Draft']
nyc_df_clean['Status'].value_counts()
Out[ ]:
Status
Closed      362114
Open          1638
Assigned       802
Name: count, dtype: int64
In [ ]:
nyc_df_clean.groupby('Status')['Resolution Action Updated Date'].apply(lambda x: x.isnull().sum())
Out[ ]:
Status
Assigned     760
Closed         2
Open        1637
Name: Resolution Action Updated Date, dtype: int64
In [ ]:
''' The Resolution Action was missing for closed status. But there were closed date already so i will replace these missing with it closed date'''
nyc_df_clean[(nyc_df_clean['Status'] == 'Closed') & (nyc_df_clean['Resolution Action Updated Date'].isnull())][['Closed Date','Due Date','Complaint Type','Resolution Description']]
Out[ ]:
Closed Date Due Date Complaint Type Resolution Description
175964 2015-07-21 17:24:17 2015-07-21 23:15:37 Illegal Parking The Police Department responded to the complai...
294151 2015-04-05 21:09:42 2015-04-06 04:54:44 Blocked Driveway Your complaint has been forwarded to the New Y...
In [ ]:
mask2 = (nyc_df_clean['Status'] == 'Closed') & (nyc_df_clean['Resolution Action Updated Date'].isnull())
nyc_df_clean.loc[mask2,'Resolution Action Updated Date'] = nyc_df_clean.loc[mask2, 'Closed Date']
In [ ]:
nyc_df_clean['Resolution Description'].value_counts()
Out[ ]:
Resolution Description
The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.                                                                                                                                                                                                                                                                                                                                                                               107051
The Police Department responded to the complaint and took action to fix the condition.                                                                                                                                                                                                                                                                                                                                                                                                                                 72916
The Police Department responded and upon arrival those responsible for the condition were gone.                                                                                                                                                                                                                                                                                                                                                                                                                        71281
The Police Department responded to the complaint and determined that police action was not necessary.                                                                                                                                                                                                                                                                                                                                                                                                                  47612
The Police Department issued a summons in response to the complaint.                                                                                                                                                                                                                                                                                                                                                                                                                                                   37342
The Police Department reviewed your complaint and provided additional information below.                                                                                                                                                                                                                                                                                                                                                                                                                               16040
Your request can not be processed at this time because of insufficient contact information. Please create a new Service Request on NYC.gov and provide more detailed contact information.                                                                                                                                                                                                                                                                                                                               5258
This complaint does not fall under the Police Department's jurisdiction.                                                                                                                                                                                                                                                                                                                                                                                                                                                2113
Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference.                                                                                                                                                                                                                                                                                                        2108
The Police Department responded to the complaint but officers were unable to gain entry into the premises.                                                                                                                                                                                                                                                                                                                                                                                                              1549
The Police Department responded to the complaint and a report was prepared.                                                                                                                                                                                                                                                                                                                                                                                                                                              869
Your complaint has been forwarded to the New York Police Department for a non-emergency response. If the police determine the vehicle is illegally parked, they will ticket the vehicle and then you may either contact a private towing company to remove the vehicle or ask your local precinct to contact 'rotation tow'. Any fees charged for towing will have to be paid by the vehicle owner. 311 will have additional information in 8 hours. Please note your service request number for future reference.       254
The Police Department made an arrest in response to the complaint.                                                                                                                                                                                                                                                                                                                                                                                                                                                       151
The New York City Police Department received your comments and forwarded them to the appropriate unit for resolution. You may follow up by calling (646) 610-6952 after 60 days from submitting your agency issue.                                                                                                                                                                                                                                                                                                         8
Your complaint has been received by the Police Department and it has been determined that a long-term investigation may be necessary.  Additional information will be available at the conclusion of the investigation.                                                                                                                                                                                                                                                                                                    1
The condition was determined to be an issue appropriate for handling by an alternate entity.  The Department of Parks and Recreation has notified the appropriate resource.                                                                                                                                                                                                                                                                                                                                                1
Name: count, dtype: int64
In [ ]:
'''
The 'Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference.'
solution has 100% null values for resolution action updated date.
'''
nyc_df_clean.groupby('Resolution Description')['Resolution Action Updated Date'].apply(lambda x: x.isnull().sum())
Out[ ]:
Resolution Description
The New York City Police Department received your comments and forwarded them to the appropriate unit for resolution. You may follow up by calling (646) 610-6952 after 60 days from submitting your agency issue.                                                                                                                                                                                                                                                                                                       0
The Police Department issued a summons in response to the complaint.                                                                                                                                                                                                                                                                                                                                                                                                                                                     1
The Police Department made an arrest in response to the complaint.                                                                                                                                                                                                                                                                                                                                                                                                                                                       0
The Police Department responded and upon arrival those responsible for the condition were gone.                                                                                                                                                                                                                                                                                                                                                                                                                          8
The Police Department responded to the complaint and a report was prepared.                                                                                                                                                                                                                                                                                                                                                                                                                                              0
The Police Department responded to the complaint and determined that police action was not necessary.                                                                                                                                                                                                                                                                                                                                                                                                                    5
The Police Department responded to the complaint and took action to fix the condition.                                                                                                                                                                                                                                                                                                                                                                                                                                  14
The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.                                                                                                                                                                                                                                                                                                                                                                                  8
The Police Department responded to the complaint but officers were unable to gain entry into the premises.                                                                                                                                                                                                                                                                                                                                                                                                               0
The Police Department reviewed your complaint and provided additional information below.                                                                                                                                                                                                                                                                                                                                                                                                                                 0
The condition was determined to be an issue appropriate for handling by an alternate entity.  The Department of Parks and Recreation has notified the appropriate resource.                                                                                                                                                                                                                                                                                                                                              0
This complaint does not fall under the Police Department's jurisdiction.                                                                                                                                                                                                                                                                                                                                                                                                                                                 0
Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference.                                                                                                                                                                                                                                                                                                      2108
Your complaint has been forwarded to the New York Police Department for a non-emergency response. If the police determine the vehicle is illegally parked, they will ticket the vehicle and then you may either contact a private towing company to remove the vehicle or ask your local precinct to contact 'rotation tow'. Any fees charged for towing will have to be paid by the vehicle owner. 311 will have additional information in 8 hours. Please note your service request number for future reference.     253
Your complaint has been received by the Police Department and it has been determined that a long-term investigation may be necessary.  Additional information will be available at the conclusion of the investigation.                                                                                                                                                                                                                                                                                                  0
Your request can not be processed at this time because of insufficient contact information. Please create a new Service Request on NYC.gov and provide more detailed contact information.                                                                                                                                                                                                                                                                                                                                0
Name: Resolution Action Updated Date, dtype: int64
In [ ]:
mask3 = nyc_df_clean['Resolution Description'] == 'Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference.'
nyc_df_clean.loc[mask3, ['Created Date','Closed Date','Due Date']]
Out[ ]:
Created Date Closed Date Due Date
416 2015-12-31 14:16:04 NaT 2015-12-31 22:16:04
611 2015-12-31 09:58:06 NaT 2015-12-31 17:58:06
1648 2015-12-30 05:13:42 NaT 2015-12-30 13:13:42
1816 2015-12-29 22:44:50 NaT 2015-12-30 06:44:50
1965 2015-12-29 19:09:13 NaT 2015-12-30 03:09:13
... ... ... ...
363643 2015-01-02 11:38:52 NaT 2015-01-02 19:38:52
363789 2015-01-02 07:25:16 NaT 2015-01-02 15:25:16
363805 2015-01-02 06:14:06 NaT 2015-01-02 14:14:06
364151 2015-01-01 16:04:35 NaT 2015-01-02 00:04:35
364349 2015-01-01 07:30:10 NaT 2015-01-01 15:30:10

2108 rows × 3 columns

In [ ]:
'''  
For these 2 resolution description, we will replace it with the created date + 15 minutes
'''

mask4 = (nyc_df_clean['Resolution Description'] == "Your complaint has been forwarded to the New York Police Department for a non-emergency response. If the police determine the vehicle is illegally parked, they will ticket the vehicle and then you may either contact a private towing company to remove the vehicle or ask your local precinct to contact 'rotation tow'. Any fees charged for towing will have to be paid by the vehicle owner. 311 will have additional information in 8 hours. Please note your service request number for future reference.")\
& (nyc_df_clean['Closed Date'].notnull())
nyc_df_clean.loc[mask4, ['Created Date','Closed Date','Due Date']]
Out[ ]:
Created Date Closed Date Due Date
294151 2015-04-05 20:54:44 2015-04-05 21:09:42 2015-04-06 04:54:44
In [ ]:
def fill_na_resolution_date(row):
    if pd.isnull(row['Resolution Action Updated Date']) and row['Resolution Description'] in ["Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference.",
                                                                                              "Your complaint has been forwarded to the New York Police Department for a non-emergency response. If the police determine the vehicle is illegally parked, they will ticket the vehicle and then you may either contact a private towing company to remove the vehicle or ask your local precinct to contact 'rotation tow'. Any fees charged for towing will have to be paid by the vehicle owner. 311 will have additional information in 8 hours. Please note your service request number for future reference."]:
        return row['Created Date'] + pd.Timedelta(minutes=15)
    return row['Resolution Action Updated Date']
In [ ]:
nyc_df_clean['Resolution Action Updated Date'] = nyc_df_clean.apply(fill_na_resolution_date, axis=1)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
Index: 364554 entries, 0 to 364557
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   Unique Key                      364554 non-null  int64         
 1   Created Date                    364554 non-null  datetime64[ns]
 2   Closed Date                     362176 non-null  datetime64[ns]
 3   Agency                          364554 non-null  object        
 4   Agency Name                     364554 non-null  object        
 5   Complaint Type                  364554 non-null  object        
 6   Descriptor                      364554 non-null  object        
 7   Location Type                   364554 non-null  object        
 8   Address Type                    364554 non-null  object        
 9   City                            364554 non-null  object        
 10  Facility Type                   364554 non-null  object        
 11  Status                          364554 non-null  object        
 12  Due Date                        364554 non-null  datetime64[ns]
 13  Resolution Description          364554 non-null  object        
 14  Resolution Action Updated Date  364518 non-null  datetime64[ns]
 15  Community Board                 364554 non-null  object        
 16  Borough                         364554 non-null  object        
 17  Park Facility Name              364554 non-null  object        
 18  Park Borough                    364554 non-null  object        
 19  Latitude                        364554 non-null  float64       
 20  Longitude                       364554 non-null  float64       
dtypes: datetime64[ns](4), float64(2), int64(1), object(14)
memory usage: 61.2+ MB
In [ ]:
nyc_df_clean[nyc_df_clean['Resolution Action Updated Date'].isnull()]['Resolution Description'].value_counts()
Out[ ]:
Resolution Description
The Police Department responded to the complaint and took action to fix the condition.                                                     14
The Police Department responded and upon arrival those responsible for the condition were gone.                                             8
The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.     8
The Police Department responded to the complaint and determined that police action was not necessary.                                       5
The Police Department issued a summons in response to the complaint.                                                                        1
Name: count, dtype: int64
In [ ]:
''' For the remain null value of resolution action updated date, we will replace with the max duration between resolution action updated date and created date
group by each resolution description '''

nyc_df_clean['duration'] = nyc_df_clean['Resolution Action Updated Date'] - nyc_df_clean['Created Date']
max_duration = nyc_df_clean.groupby('Resolution Description')['duration'].max()
def fill_na_with_max_duration(row):
    if pd.isnull(row['Resolution Action Updated Date']):
        resolution_desc = row['Resolution Description']
        max_duration_time = max_duration[resolution_desc]
        return row['Created Date'] + max_duration_time
    return row['Resolution Action Updated Date']
In [ ]:
nyc_df_clean['Resolution Action Updated Date'] = nyc_df_clean.apply(fill_na_with_max_duration, axis = 1)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
Index: 364554 entries, 0 to 364557
Data columns (total 22 columns):
 #   Column                          Non-Null Count   Dtype          
---  ------                          --------------   -----          
 0   Unique Key                      364554 non-null  int64          
 1   Created Date                    364554 non-null  datetime64[ns] 
 2   Closed Date                     362176 non-null  datetime64[ns] 
 3   Agency                          364554 non-null  object         
 4   Agency Name                     364554 non-null  object         
 5   Complaint Type                  364554 non-null  object         
 6   Descriptor                      364554 non-null  object         
 7   Location Type                   364554 non-null  object         
 8   Address Type                    364554 non-null  object         
 9   City                            364554 non-null  object         
 10  Facility Type                   364554 non-null  object         
 11  Status                          364554 non-null  object         
 12  Due Date                        364554 non-null  datetime64[ns] 
 13  Resolution Description          364554 non-null  object         
 14  Resolution Action Updated Date  364554 non-null  datetime64[ns] 
 15  Community Board                 364554 non-null  object         
 16  Borough                         364554 non-null  object         
 17  Park Facility Name              364554 non-null  object         
 18  Park Borough                    364554 non-null  object         
 19  Latitude                        364554 non-null  float64        
 20  Longitude                       364554 non-null  float64        
 21  duration                        364518 non-null  timedelta64[ns]
dtypes: datetime64[ns](4), float64(2), int64(1), object(14), timedelta64[ns](1)
memory usage: 64.0+ MB
In [ ]:
''' 
We could seen that the Park Borough has the same value with Borough, so we just need to keep one of them
Moreover, the Park Facility Name has a huge unspecified value, so we will drop this columnn.
'''
print(nyc_df_clean['Park Facility Name'].value_counts())
print((nyc_df_clean['Borough']==nyc_df_clean['Park Borough']).sum())
Park Facility Name
Unspecified                        364553
Alley Pond Park - Nature Center         1
Name: count, dtype: int64
364554
In [ ]:
nyc_df_clean.drop(columns=['Park Borough','Park Facility Name','duration'], inplace=True)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'>
Index: 364554 entries, 0 to 364557
Data columns (total 19 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   Unique Key                      364554 non-null  int64         
 1   Created Date                    364554 non-null  datetime64[ns]
 2   Closed Date                     362176 non-null  datetime64[ns]
 3   Agency                          364554 non-null  object        
 4   Agency Name                     364554 non-null  object        
 5   Complaint Type                  364554 non-null  object        
 6   Descriptor                      364554 non-null  object        
 7   Location Type                   364554 non-null  object        
 8   Address Type                    364554 non-null  object        
 9   City                            364554 non-null  object        
 10  Facility Type                   364554 non-null  object        
 11  Status                          364554 non-null  object        
 12  Due Date                        364554 non-null  datetime64[ns]
 13  Resolution Description          364554 non-null  object        
 14  Resolution Action Updated Date  364554 non-null  datetime64[ns]
 15  Community Board                 364554 non-null  object        
 16  Borough                         364554 non-null  object        
 17  Latitude                        364554 non-null  float64       
 18  Longitude                       364554 non-null  float64       
dtypes: datetime64[ns](4), float64(2), int64(1), object(12)
memory usage: 55.6+ MB

Analysis¶

How many agencies here?¶

In [ ]:
''' 
We could see that only one agency value has 3 different Agency Name.
Actually,  the Internal Affairs Bureau ("IAB") was set up as a unit within the NYPD that investigates serious police misconduct as well as police corruption.
Hence, we only have one agency here, it is NYDC or New York City Police Department.
So actually, these 2 columns aren't necessary.
We could drop them but i don't mind to keep them in the dataframe.
'''
nyc_df_clean.groupby('Agency')['Agency Name'].value_counts()
Out[ ]:
Agency  Agency Name                    
NYPD    New York City Police Department    364545
        Internal Affairs Bureau                 8
        NYPD                                    1
Name: count, dtype: int64

How many complaint type are there?¶

In [ ]:
complaint_type_counts = nyc_df_clean['Complaint Type'].value_counts()
fig = px.bar(x = complaint_type_counts.index, y = complaint_type_counts.values, color=complaint_type_counts.index, color_discrete_sequence=colors,
       title='Which one is the most complaint in NY?')
fig.update_layout(showlegend = False, xaxis_title = 'Complaint Type', yaxis_title='Number of Complaint')
In [ ]:
borough_counts = nyc_df_clean.groupby('Borough').size().reset_index(name='Count')
fig = px.pie(borough_counts, values='Count', names='Borough', color_discrete_sequence=colors, title='Complaint Distribution by Borough')
fig.show()
In [ ]:
complaint_counts = nyc_df_clean.groupby(['Borough', 'Complaint Type']).size().reset_index(name='Count')
complaint_counts = complaint_counts.sort_values(by=['Borough', 'Count'], ascending=[True, False])

num_boroughs = len(complaint_counts['Borough'].unique())
fig = make_subplots(rows=3, cols=2, subplot_titles=complaint_counts['Borough'].unique())
color_borough = colors[:len(complaint_counts['Borough'].unique())]

for i, (borough, color) in enumerate(zip(complaint_counts['Borough'].unique(), color_borough), 1):
    data_borough = complaint_counts[complaint_counts['Borough'] == borough]

    fig.add_trace(
        go.Bar(x=data_borough['Complaint Type'], y=data_borough['Count'], name=borough, marker=dict(color=color)),
        row=(i - 1) // 2 + 1, col=(i - 1) % 2 + 1  # Adjust the row and column indices
    )

fig.update_layout(
    height=600 * (num_boroughs/2),  # Adjust the height of the entire figure based on the number of unique Boroughs
    showlegend=False,  # Hide the legend
    title_text="Number of Complaint Types in each Borough (Sorted by Highest Count)",
    title_x=0.5  # Align the main title to the center
)
fig.show()

It looks like the popular type of complaint is blocked driveway and illegal parking. These 2 type of complaints have the highest number of complaint received ticket to NYPD in BRONX, BROOKLYN, QUEENS, AND STATE ISLAND. In MANHATTAN borough, the top 2 issues are all about Noise - Street/Sidewalk and Noise - Commercial.

For Blocked Driveway issue, there is 75% descriptor about 'No Access' Meanwhile, for Illegal parking, most of the complaint is about Posted Parking sign Violation (29%) and Blocked Hydrant (22%)

In [ ]:
blocked_driveway_df = nyc_df_clean[nyc_df_clean['Complaint Type'] == "Blocked Driveway"].groupby('Descriptor').size().reset_index(name = 'Count')
illegal_parking_df = nyc_df_clean[nyc_df_clean['Complaint Type'] == "Illegal Parking"].groupby('Descriptor').size().reset_index(name = 'Count')

fig = make_subplots(rows=1, cols=2, subplot_titles=['Blocked Driveway', 'Illegal Parking'], specs=[[{'type':'domain'}, {'type':'domain'}]])
# Add the pie chart for "Blocked Driveway" to the subplot
fig.add_trace(
    go.Pie(labels=blocked_driveway_df['Descriptor'], values=blocked_driveway_df['Count'], name='Blocked Driveway'),
    row=1, col=1
)
# Add the pie chart for "Illegal Parking" to the subplot
fig.add_trace(
    go.Pie(labels=illegal_parking_df['Descriptor'], values=illegal_parking_df['Count'], name='Illegal Parking'),
    row=1, col=2
)
fig.update_layout(
    title_text="Descriptor Distribution for Blocked Driveway and Illegal Parking",
    title_x=0.5
)
fig.update_traces( marker=dict(colors=colors, line=dict(color='#ffffff', width=1)))
fig.show()

Status and Complaint Type Process¶

In [ ]: